package test;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import test.hibernate.DAOFactory;
import test.hibernate.dao.HibernateUtil;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Crawler that restricts itself to numbered pages under {@code zk.com.mk}
 * and hands every accepted page to a {@link PageHandler}.
 */
public class MyCrawler extends WebCrawler {

	private static final Logger LOG = Logger.getLogger(MyCrawler.class);

	/**
	 * Matches URLs that end in a static-resource or binary file extension.
	 * Name and package-private visibility are kept so existing same-package
	 * references continue to compile.
	 */
	static final Pattern filters = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g"
			+ "|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf"
			+ "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

	/** Matches URLs whose last path segment consists only of digits. */
	private static final Pattern NUMERIC_TAIL = Pattern.compile(".*/\\d+");

	/** Host fragment (lower case) a URL must contain to be followed. */
	private static final String SITE_HOST = "zk.com.mk";

	/** Upper-cased path fragment a fetched page URL must contain to be handled. */
	private static final String PAGES_MARKER = "zk.com.mk/Pages".toUpperCase(Locale.ROOT);

	/** Upper-cased URL suffix of pages that are never handled. */
	private static final String MAIL_SUFFIX = "/mail".toUpperCase(Locale.ROOT);

	/**
	 * Decides whether the given URL should be fetched.
	 *
	 * @param url the candidate URL
	 * @return {@code true} only when the lower-cased URL does not end in one
	 *         of the filtered file extensions, contains {@code zk.com.mk} and
	 *         ends with a slash followed by one or more digits
	 */
	public boolean shouldVisit(WebURL url) {
		LOG.info("URL: " + url);
		// Locale.ROOT keeps case folding independent of the JVM default locale
		// (e.g. Turkish would turn "GIF" into "gıf" and bypass the filter).
		String href = url.getURL().toLowerCase(Locale.ROOT);
		if (filters.matcher(href).matches()) {
			return false;
		}
		if (href.contains(SITE_HOST) && NUMERIC_TAIL.matcher(href).matches()) {
			LOG.info("WILL VISIT URL: " + url);
			return true;
		}
		return false;
	}

	/**
	 * Processes a fetched page. Pages whose URL does not contain
	 * {@code zk.com.mk/Pages} (compared case-insensitively), or whose URL ends
	 * with {@code /mail}, are skipped; every other page is passed to a fresh
	 * {@link PageHandler}.
	 *
	 * @param page the fetched page
	 */
	public void visit(Page page) {
		String url = page.getWebURL().getURL();
		LOG.info("Scanning page: " + url);
		String upperUrl = url.toUpperCase(Locale.ROOT);
		if (!upperUrl.contains(PAGES_MARKER) || upperUrl.endsWith(MAIL_SUFFIX)) {
			return;
		}
		// The handler is created only for pages that are actually processed.
		PageHandler handler = new PageHandler();
		handler.handlePage(page);
	}

}